import pandas as pd
# Load the workbook into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_excel('Clusteranalyse1.xlsx')
# Show the column labels to see which variables are available.
df.columns
Index(['ID', 'sex_wm', 'gms', 'hs', 'rs', 'gym', 'sbbz', 'school_full',
'birthcountry_out', 'migra', 'job_1', 'job_2', 'engagement',
'enga_barrier', 'ZielStudium', 'ZielAbi', 'ZielReal', 'ZielHaupt',
'ZielAndere', 'ZielWeißNicht', 'Lehre', 'studieren', 'arbeiten',
'weiterfSchule', 'Praktikum', 'jobben', 'BFD.FSJde', 'BFD.FSJaus',
'Ausland', 'Auslandgrob', 'Bundeswehr', 'Anderes', 'WeißNicht',
'asp_edu_3', 'Technik', 'Handwerk', 'Verkehr', 'Soziales', 'IT',
'Kunst', 'Verkauf', 'Maschinenbau', 'Gesundheit', 'Koerperpflege',
'Natur.LW', 'Bau', 'Verwaltung', 'Medien', 'Produktion', 'Recht',
'Sicherheit', 'Sport', 'worry_job_rec', 'worry_job_dicho', 'internet',
'well_1', 'well_2', 'well_3', 'efficacy', 'anxiety_dicho',
'mentalhealth'],
dtype='object')
def f(row):
    """Return the school-type label for one row of the data frame.

    The indicator columns are checked in a fixed order ('gms', 'hs', 'rs',
    'gym'); the name of the first one whose value equals 1 is returned.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must support ``row[name]`` for the indicator columns.

    Returns
    -------
    str
        'gms', 'hs', 'rs' or 'gym', or 'kA' when none of them equals 1
        (missing values never compare equal to 1, so they also give 'kA').

    NOTE(review): 'sbbz' is a column of the frame as well but is not checked
    here, so those rows end up as 'kA' -- confirm that this is intended.
    """
    # Order matters: it reproduces the precedence of the original if/elif chain.
    for school_type in ('gms', 'hs', 'rs', 'gym'):
        if row[school_type] == 1:
            return school_type
    return 'kA'
# Add a single categorical 'Schultyp' column by calling f on every row (axis=1).
df['Schultyp']= df.apply(f, axis=1)
import plotly.express as px
# Complete cases only: discard every row with a missing value in any column.
data = df.dropna()
# Grouped bars of 'worry_job_rec', one colour per 'migra' value, normalised to percent.
px.histogram(
    data,
    x='worry_job_rec',
    color='migra',
    barmode='group',
    barnorm='percent',
)
# Complete cases only, with 'worry_job_rec' cast to str so plotly treats it as a
# discrete colour category. The cast happens before the result is bound to `data`,
# so no column is assigned on the frame returned by dropna() (that assignment is
# what raised SettingWithCopyWarning).
data = df.dropna().astype({'worry_job_rec': str})
# Grouped bars per school type, one colour per worry level, normalised to percent.
px.histogram(
    data,
    x='Schultyp',
    color='worry_job_rec',
    barmode='group',
    barnorm='percent',
)
<ipython-input-7-ceeda5fecd14>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Complete cases only: discard every row with a missing value in any column.
data = df.dropna()
# Grouped bars per school type, one colour per 'Ausland' value, normalised to fractions.
px.histogram(
    data,
    x='Schultyp',
    color='Ausland',
    barmode='group',
    barnorm='fraction',
)
# Complete cases only: discard every row with a missing value in any column.
data = df.dropna()
# Grouped bars per school type, one colour per 'well_2' value, normalised to fractions.
px.histogram(
    data,
    x='Schultyp',
    color='well_2',
    barmode='group',
    barnorm='fraction',
)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(15, 20))
# Correlate numeric columns only: the frame also carries the string column
# 'Schultyp', and recent pandas versions raise instead of silently skipping
# non-numeric columns in DataFrame.corr().
corr = df.select_dtypes(include='number').corr()
# Boolean mask that hides the upper triangle (diagonal included), leaving each
# pairwise correlation visible exactly once.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corr, cmap="RdBu", mask=mask, ax=ax)
<AxesSubplot:>
# Prepare a vector of color mapped to the 'cyl' column
# my_palette = dict(zip(df.cyl.unique(), ["orange","yellow","brown"]))
# row_colors = df.cyl.map(my_palette)
# # plot
# sns.clustermap(df, metric="correlation", method="single", cmap="Blues", standard_scale=1)
# plt.show()
# Complete cases only: discard every row with a missing value in any column.
data = df.dropna()
# Grouped bars of 'Soziales', one colour per 'sex_wm' value, normalised to fractions.
px.histogram(
    data,
    x='Soziales',
    color='sex_wm',
    barmode='group',
    barnorm='fraction',
)
# Feature matrix: columns at positions 2..60, restricted to rows without any
# missing value. Per the column listing this starts at 'gms', so the first two
# columns ('ID' and 'sex_wm') are left out.
# NOTE(review): confirm that leaving out 'sex_wm' is intended.
X = df.iloc[:, 2:61].dropna()
import numpy as np
from sklearn.decomposition import PCA
# Project X onto its first 20 principal components (seeded for reproducibility).
pca = PCA(n_components=20, random_state=1)
X_pca = pca.fit_transform(X)
# Running total of the per-component explained-variance ratios.
exp_var_cumul = pca.explained_variance_ratio_.cumsum()
# Area chart: cumulative explained variance against the number of components kept.
px.area(
    x=range(1, len(exp_var_cumul) + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
)
Variance explained
from sklearn.cluster import KMeans
# Elbow method: fit KMeans for k = 1..5 on the PCA scores and record the inertia
# (within-cluster sum of squares) of each fit.
distortions = []
K_to_try = range(1, 6)
for i in K_to_try:
    # `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23 and
    # removed in 1.0, where it raises a TypeError. `n_init=10` pins the number
    # of restarts to the former default so results do not shift between versions.
    model = KMeans(
        n_clusters=i,
        init='k-means++',
        n_init=10,
        random_state=1)
    model.fit(X_pca)
    distortions.append(model.inertia_)
plt.plot(K_to_try, distortions, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Distortion')
plt.show()
C:\Users\user\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:792: FutureWarning: 'n_jobs' was deprecated in version 0.23 and will be removed in 1.0 (renaming of 0.25). C:\Users\user\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:792: FutureWarning: 'n_jobs' was deprecated in version 0.23 and will be removed in 1.0 (renaming of 0.25). C:\Users\user\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:792: FutureWarning: 'n_jobs' was deprecated in version 0.23 and will be removed in 1.0 (renaming of 0.25). C:\Users\user\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:792: FutureWarning: 'n_jobs' was deprecated in version 0.23 and will be removed in 1.0 (renaming of 0.25). C:\Users\user\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:792: FutureWarning: 'n_jobs' was deprecated in version 0.23 and will be removed in 1.0 (renaming of 0.25).
# Fit the final model with the k read off the elbow plot (k = 3).
# `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23 and removed
# in 1.0. `n_init=10` pins the number of restarts to the former default.
model = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=10,
    random_state=1)
model = model.fit(X_pca)
# Cluster label for every row of X_pca.
y = model.predict(X_pca)
C:\Users\user\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:792: FutureWarning: 'n_jobs' was deprecated in version 0.23 and will be removed in 1.0 (renaming of 0.25).
# Re-project X onto two components for plotting (this rebinds pca and X_pca).
pca = PCA(n_components=2, random_state=1)
X_pca = pca.fit_transform(X)
# Scores plus the cluster labels in y, matched to the rows by position.
dfPCA = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2']).assign(color=y)
# dfPCA['color1']=y_final
px.scatter(dfPCA, x='PCA1', y='PCA2', color='color')
# Re-project X onto three components for a 3-D view (this rebinds pca and X_pca).
pca = PCA(n_components=3, random_state=1)
X_pca = pca.fit_transform(X)
dfPCA1 = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2', 'PCA3'])
# y_final = model.predict(X_pca)
# Cluster labels as strings so plotly colours them as discrete categories.
dfPCA1['color1'] = pd.Series(y, index=dfPCA1.index).astype(str)
px.scatter_3d(dfPCA1, x='PCA1', y='PCA2', z='PCA3', color='color1')
# Cluster on the original feature space instead of the PCA scores.
# fit() returns the estimator itself, so `model.fit(X)` would refit `model` in
# place and make `model_k` a second name for the same object. Fitting an unfitted
# clone (same hyper-parameters) keeps `model` as it was fitted on the PCA scores.
from sklearn.base import clone

model_k = clone(model).fit(X)
# Cluster label for every row of X.
y_final = model_k.predict(X)
C:\Users\user\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:792: FutureWarning: 'n_jobs' was deprecated in version 0.23 and will be removed in 1.0 (renaming of 0.25).
# Cluster sizes: number of rows per value of the 'color' (cluster label) column.
print(dfPCA.groupby(['color']).count())
PCA1 PCA2 color 0 840 840 1 741 741 2 143 143
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
# PCA biplot: the 2-D scores as a scatter, with one line per feature drawn from
# the origin to that feature's loading.
features = X.columns
# random_state matches the other PCA fits in this notebook, so the projection
# stays reproducible if scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=1)
components = pca.fit_transform(X)
# Loadings: component directions scaled by the standard deviation of each component.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
fig = px.scatter(components, x=0, y=1)
for feature, (load_x, load_y) in zip(features, loadings):
    fig.add_shape(
        type='line',
        x0=0, y0=0,
        x1=load_x,
        y1=load_y
    )
    # Label the tip of the line with the feature name.
    fig.add_annotation(
        x=load_x,
        y=load_y,
        ax=0, ay=0,
        xanchor="center",
        yanchor="bottom",
        text=feature,
    )
fig.show()
# Number of rows that received a cluster label.
len(y_final)
1724
# (rows, columns) of the full frame, for comparison with the number of labelled rows.
df.shape
(2160, 62)
# Work on a copy of the frame, restricted to rows without any missing value.
data = df.copy().dropna()
# Grouped bars per school type, one colour per 'well_2' value, normalised to fractions.
px.histogram(
    data,
    x='Schultyp',
    color='well_2',
    barmode='group',
    barnorm='fraction',
)
# Import required libraries
import pandas as pd
from sklearn.datasets import load_iris
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
# (rows, columns) of the complete-case feature matrix.
X.shape
(1724, 59)
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
# Bartlett's test of sphericity on X: returns the chi-square statistic and its p-value.
chi_square_value,p_value=calculate_bartlett_sphericity(X)
# Display both values.
chi_square_value, p_value
(190341.51509927955, 0.0)
from factor_analyzer.factor_analyzer import calculate_kmo
# Kaiser-Meyer-Olkin measure on X: calculate_kmo returns two results, unpacked here
# as kmo_all and kmo_model (presumably per-variable values and the overall value).
kmo_all,kmo_model=calculate_kmo(X)
# Display kmo_model.
# NOTE(review): the recorded value (~0.39) is low; values below roughly 0.5-0.6 are
# conventionally read as unsuitable for factor analysis -- confirm before relying
# on the factor solutions.
kmo_model
C:\Users\user\anaconda3\lib\site-packages\factor_analyzer\utils.py:249: UserWarning: The inverse of the variance-covariance matrix was calculated using the Moore-Penrose generalized matrix inversion, due to its determinant being at or very close to zero.
0.3897399379283963
# Fit an unrotated factor model on X in order to inspect the eigenvalues.
# fit() takes (X, y=None) and ignores y, so the former `fa.fit(X, 25)` never
# requested 25 factors; the constructor's n_factors default applied then and
# still applies now. Use FactorAnalyzer(n_factors=...) to change the factor count.
fa = FactorAnalyzer(rotation=None)
fa.fit(X)
# get_eigenvalues() returns two arrays; the first one is displayed and is the
# one used for the scree plot.
ev, v = fa.get_eigenvalues()
ev
array([ 3.30624443e+00, 2.94107800e+00, 2.48809837e+00, 1.97513677e+00,
1.81723050e+00, 1.70948061e+00, 1.63810022e+00, 1.60547082e+00,
1.51653760e+00, 1.45879928e+00, 1.41060217e+00, 1.30319422e+00,
1.29749209e+00, 1.22368025e+00, 1.21602507e+00, 1.16282963e+00,
1.15148815e+00, 1.14337516e+00, 1.11995531e+00, 1.10718847e+00,
1.08751327e+00, 1.06836128e+00, 1.05079085e+00, 1.02074078e+00,
1.00888582e+00, 9.92392700e-01, 9.89911553e-01, 9.73987376e-01,
9.50059904e-01, 9.30959147e-01, 9.21358687e-01, 9.18329394e-01,
8.95080621e-01, 8.72857171e-01, 8.50361522e-01, 8.43169343e-01,
8.22970690e-01, 7.83786504e-01, 7.76023131e-01, 7.52296116e-01,
7.37883222e-01, 7.30028895e-01, 7.17510606e-01, 6.72775170e-01,
6.63162607e-01, 6.36490331e-01, 5.75534427e-01, 5.58128193e-01,
5.01935531e-01, 4.81383222e-01, 4.65650462e-01, 4.14401387e-01,
3.90249196e-01, 1.70146875e-01, 1.29246696e-01, 5.36301753e-02,
-7.62246098e-17, -1.77266240e-16, -2.10494663e-16])
# Scree plot: eigenvalues against factor number, drawn as markers joined by a line.
factor_numbers = range(1, X.shape[1] + 1)
plt.scatter(factor_numbers, ev)
plt.plot(factor_numbers, ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()
# Fit a three-factor model on X with varimax rotation.
fa = FactorAnalyzer(n_factors=3, rotation="varimax")
fa = fa.fit(X)
# Loadings table: one row per input variable (index 'Variable'), one column per factor.
Loading = pd.DataFrame(
    fa.loadings_,
    index=pd.Index(X.columns, name='Variable'),
    columns=['1', '2', '3'],
)
# Annotated heatmap of the loadings, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(10, 20))
sns.heatmap(Loading, cmap="RdBu", annot=True, ax=ax)
# Variance figures for the fitted factors. get_factor_variance() returns three
# arrays; in the recorded output the third is the running total of the second.
print(fa.get_factor_variance())
(array([2.60768088, 2.24879264, 2.10266123]), array([0.04419798, 0.03811513, 0.03563833]), array([0.04419798, 0.08231311, 0.11795144]))
# Same variance figures as a displayed value rather than printed text
# (three arrays, one entry per fitted factor).
fa.get_factor_variance()
(array([2.60768088, 2.24879264, 2.10266123]), array([0.04419798, 0.03811513, 0.03563833]), array([0.04419798, 0.08231311, 0.11795144]))
# Hand-picked variable subset for the second factor analysis.
# Rows with missing values are dropped, as is done for X: NaNs in this subset make
# calculate_kmo fail ("SVD did not converge") and make Bartlett's test return NaN.
# NOTE(review): later factor fits on X1 therefore use complete cases only, not
# whatever missing-value handling FactorAnalyzer applies by default.
X1 = df[['sex_wm', 'gms', 'hs', 'rs', 'gym', 'sbbz', 'school_full',
         'birthcountry_out', 'migra', 'job_1', 'job_2', 'engagement',
         'enga_barrier', 'ZielWeißNicht', 'Lehre', 'studieren', 'arbeiten',
         'weiterfSchule', 'Praktikum', 'jobben', 'BFD.FSJaus',
         'Ausland', 'Auslandgrob', 'Bundeswehr', 'Anderes', 'WeißNicht',
         'asp_edu_3', 'Technik', 'Handwerk', 'Verkehr', 'Soziales', 'IT',
         'Kunst', 'Verkauf', 'Maschinenbau', 'Gesundheit', 'Koerperpflege',
         'Natur.LW', 'Bau', 'Verwaltung', 'Medien', 'Produktion', 'Recht',
         'Sicherheit', 'Sport', 'worry_job_rec', 'internet',
         'well_1', 'well_2', 'well_3', 'efficacy', 'anxiety_dicho',
         'mentalhealth']].dropna()
from factor_analyzer.factor_analyzer import calculate_kmo
# KMO for the reduced variable set, then for the full set, for comparison.
# calculate_kmo needs complete cases: with NaNs in the input the matrix inversion
# fails ("SVD did not converge"). dropna() is a no-op on data without missing values.
kmo_all, kmo_model = calculate_kmo(X1.dropna())
print(kmo_model)
kmo_all, kmo_model = calculate_kmo(X)
print(kmo_model)
C:\Users\user\anaconda3\lib\site-packages\numpy\linalg\linalg.py:2159: RuntimeWarning: invalid value encountered in det
--------------------------------------------------------------------------- AssertionError Traceback (most recent call last) ~\anaconda3\lib\site-packages\factor_analyzer\utils.py in partial_correlations(x) 244 try: --> 245 assert np.linalg.det(x_cov) > np.finfo(np.float32).eps 246 icvx = np.linalg.inv(x_cov) AssertionError: During handling of the above exception, another exception occurred: LinAlgError Traceback (most recent call last) <ipython-input-39-003cba19c7a0> in <module> 1 from factor_analyzer.factor_analyzer import calculate_kmo ----> 2 kmo_all,kmo_model=calculate_kmo(X1) 3 print(kmo_model) 4 kmo_all,kmo_model=calculate_kmo(X) 5 print(kmo_model) ~\anaconda3\lib\site-packages\factor_analyzer\factor_analyzer.py in calculate_kmo(x) 53 54 # calculate the partial correlations ---> 55 partial_corr = partial_correlations(x) 56 57 # calcualte the pair-wise correlations ~\anaconda3\lib\site-packages\factor_analyzer\utils.py in partial_correlations(x) 246 icvx = np.linalg.inv(x_cov) 247 except AssertionError: --> 248 icvx = np.linalg.pinv(x_cov) 249 warnings.warn('The inverse of the variance-covariance matrix ' 250 'was calculated using the Moore-Penrose generalized ' <__array_function__ internals> in pinv(*args, **kwargs) ~\anaconda3\lib\site-packages\numpy\linalg\linalg.py in pinv(a, rcond, hermitian) 2001 return wrap(res) 2002 a = a.conjugate() -> 2003 u, s, vt = svd(a, full_matrices=False, hermitian=hermitian) 2004 2005 # discard small singular values <__array_function__ internals> in svd(*args, **kwargs) ~\anaconda3\lib\site-packages\numpy\linalg\linalg.py in svd(a, full_matrices, compute_uv, hermitian) 1659 1660 signature = 'D->DdD' if isComplexType(t) else 'd->ddd' -> 1661 u, s, vh = gufunc(a, signature=signature, extobj=extobj) 1662 u = u.astype(result_t, copy=False) 1663 s = s.astype(_realType(result_t), copy=False) ~\anaconda3\lib\site-packages\numpy\linalg\linalg.py in _raise_linalgerror_svd_nonconvergence(err, flag) 95 96 def 
_raise_linalgerror_svd_nonconvergence(err, flag): ---> 97 raise LinAlgError("SVD did not converge") 98 99 def _raise_linalgerror_lstsq(err, flag): LinAlgError: SVD did not converge
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
# Bartlett's test of sphericity on the reduced variable set.
# Missing values make the statistic come out as (nan, nan), so the test is run on
# complete cases; dropna() is a no-op on data without missing values.
chi_square_value, p_value = calculate_bartlett_sphericity(X1.dropna())
chi_square_value, p_value
(nan, nan)
# Fit an unrotated factor model on X1 in order to inspect the eigenvalues.
# fit() takes (X, y=None) and ignores y, so the former `fa.fit(X1, 25)` never
# requested 25 factors; the constructor's n_factors default applied then and
# still applies now. Use FactorAnalyzer(n_factors=...) to change the factor count.
fa = FactorAnalyzer(rotation=None)
fa.fit(X1)
# get_eigenvalues() returns two arrays; the first one is displayed and is the
# one used for the scree plot.
ev, v = fa.get_eigenvalues()
ev
array([ 2.92146988e+00, 2.37518322e+00, 2.02689681e+00, 1.75401689e+00,
1.67206004e+00, 1.61190279e+00, 1.48656154e+00, 1.40466427e+00,
1.33178845e+00, 1.30349812e+00, 1.28008930e+00, 1.24913471e+00,
1.19163186e+00, 1.18668437e+00, 1.17800642e+00, 1.11452390e+00,
1.11192712e+00, 1.09319022e+00, 1.07061621e+00, 1.04160793e+00,
1.03977503e+00, 1.02686496e+00, 1.00764729e+00, 9.94948862e-01,
9.76776542e-01, 9.64071784e-01, 9.50772327e-01, 9.47137514e-01,
9.35164985e-01, 9.09689690e-01, 8.77470487e-01, 8.66747062e-01,
8.54014347e-01, 8.40541327e-01, 8.35779220e-01, 7.89570066e-01,
7.80155675e-01, 7.69536216e-01, 7.35504869e-01, 7.11570194e-01,
7.00373138e-01, 6.66751954e-01, 6.27757682e-01, 6.03362548e-01,
5.79225670e-01, 4.85929305e-01, 4.81490863e-01, 4.68202357e-01,
4.48200666e-01, 4.20995654e-01, 2.47751304e-01, 5.07663629e-02,
-8.19381689e-16])
# Scree plot for X1: eigenvalues against factor number, markers joined by a line.
factor_numbers = range(1, X1.shape[1] + 1)
plt.scatter(factor_numbers, ev)
plt.plot(factor_numbers, ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()
# Number of factors to extract from X1.
i = 17
fa = FactorAnalyzer(n_factors=i, rotation="varimax")
fa = fa.fit(X1)
# Loadings table: one row per input variable (index 'Variable'), factor columns 0..i-1.
Loading = pd.DataFrame(
    fa.loadings_,
    index=pd.Index(X1.columns, name='Variable'),
    columns=list(range(i)),
)
# Annotated heatmap of the loadings, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(10, 20))
sns.heatmap(Loading, cmap="RdBu", annot=True, ax=ax)
# Variance figures for the fitted factors. get_factor_variance() returns three
# arrays; in the recorded output the third is the running total of the second.
print(fa.get_factor_variance())
(array([1.49394325, 1.34231375, 1.33279056, 1.24808313, 1.24536805,
1.24255694, 1.20933939, 1.1354486 , 1.13436864, 1.10899805,
1.07070873, 1.05941239, 1.02691843, 0.86639275, 0.75936075,
0.67262395, 0.64395185]), array([0.02818761, 0.02532667, 0.02514699, 0.02354874, 0.02349751,
0.02344447, 0.02281772, 0.02142356, 0.02140318, 0.02092449,
0.02020205, 0.01998891, 0.01937582, 0.01634703, 0.01432756,
0.01269102, 0.01215003]), array([0.02818761, 0.05351428, 0.07866127, 0.10221001, 0.12570752,
0.14915199, 0.17196972, 0.19339328, 0.21479646, 0.23572095,
0.255923 , 0.27591191, 0.29528773, 0.31163477, 0.32596233,
0.33865335, 0.35080338]))
# Fit a two-factor model on X1 with varimax rotation.
fa = FactorAnalyzer(n_factors=2, rotation="varimax")
fa = fa.fit(X1)
# Loadings table: one row per input variable (index 'Variable'), one column per factor.
Loading = pd.DataFrame(
    fa.loadings_,
    index=pd.Index(X1.columns, name='Variable'),
    columns=['1', '2'],
)
# Annotated heatmap of the loadings, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(10, 20))
sns.heatmap(Loading, cmap="RdBu", annot=True, ax=ax)
# Variance figures for the fitted factors. get_factor_variance() returns three
# arrays; in the recorded output the third is the running total of the second.
print(fa.get_factor_variance())
(array([1.99135399, 1.7033916 ]), array([0.03757272, 0.03213946]), array([0.03757272, 0.06971218]))